clear all
capture log close	

cd "projectfolder"

*************************
* PREPARE ANALYSIS SAMPLE 
*************************

********************************************************************************
* Open data 
********************************************************************************
// Load processed UKB phenotypic data 	
use "ExtractedData.dta", clear
// Merge with PGS
merge 1:1 ID using "PGS_ldpred_plink_EA_height_cvd_bmi.dta", gen(_scores)
keep if _scores==3 // 104 obs dropped
// Merge sibling identifiers & ancestry 
merge 1:1 ID using "Relatedness_to_siblings_UKB.dta", gen(_merge2)
merge 1:1 ID using "ancestry.dta", gen(_merge3)
// Remove those who withdrew consent
merge 1:1 ID using "w41382_20200820_withdrew_consent.dta", gen(_mergeconsent)
drop if _mergeconsent==3
*(98 observations deleted)
count

********************************************************************************
* Working Sample & Variables 
********************************************************************************
// Keep Europeans 
keep if european==1 // 92,892 observations deleted
count

// Keep working variables
keep ID famid EA ///
     c_numberbrothers c_numbersisters c_numberoldersiblings c_mumsmokingbirth e_educage e_FIscore* ///
	 c_hhinc_* c_agefirstbirth c_breastfed MoB YoB e_PC* sex h_DoD h_AoD ///
	 RL relid FS PC pcid relationship c_multiplebirth h_bw ///
	 c_motherage_0 c_motherage_1 c_motherage_2 c_fatherage_0 c_fatherage_1 c_fatherage_2 ///
	 h_date_0 h_date_1 h_date_2 EA_new ea_new_ukb_ld ea_new_ukb_ld_0 ea_new_ukb_ld_1 ea_new_23me_ukb_ld ea_23me_ld


// Family Structure & Rank  
* Prepare the sibling variables
rename c_numberbrothers n_brothers
rename c_numbersisters n_sisters
rename c_numberoldersiblings n_older_siblings

global vars n_brothers n_sisters n_older_siblings
foreach i in $vars {
	replace `i' = . if `i' < 0
	}
	
replace n_older_siblings = 0 if (n_brothers==0 & n_sisters==0) 

gen family_size = n_brothers + n_sisters + 1
gen rank = n_older_siblings + 1
sum rank 
sum rank if famid!=.
tab rank 

* Drop those where the number of older siblings is larger than family size
sum ID if (n_older_siblings > family_size & n_older_siblings!=. & family_size!=.)
drop if (n_older_siblings > family_size & n_older_siblings!=. & family_size!=.)
*(59 observations deleted)

* Imputing missing rank based on famid, ID, YoB, family_size
xtset famid ID

* Drop twins 
gen twin = ((YoB==YoB[_n+1] & MoB==MoB[_n+1] & famid==famid[_n+1]) | (YoB==YoB[_n-1] & MoB==MoB[_n-1] & famid==famid[_n-1])) & famid~=.
drop if twin==1 | c_multiplebirth==1
drop twin c_multiplebirth // 9,310 observations deleted

* Remove the instances for which the reported family_size is not consistent within a family_size within famid
egen Max_family=max(family_size) if famid!=.& family_size!=., by(famid)
egen Min_family=min(family_size) if famid!=.& family_size!=., by(famid)
gen Dif_family=Max_family-Min_family if famid!=.& family_size!=.
tab Dif_family
drop if Dif_family>0 & famid!=. // 3,530 real changes made
drop Max_family Min_family Dif_family

** Ordering siblings based on their birth year 
sort famid YoB MoB
bysort famid: gen order=_n

** Impute family_size for siblings without this info (robustness check for between family results with and without imputation)
clonevar family_size_imp=family_size
bys famid: egen temp = min(family_size)
replace family_size_imp = temp if family_size==.
drop temp

** Computing number of members in each family and dropping families with only 1 member even if they report their rank because they drop anyways when doing the FE
egen N_famid=count(famid) if famid!=., by(famid)
tab N_famid

*********************************************************************************************************
	** RANK IMPUTATIONS
*********************************************************************************************************
	
* Imputations are done if family_size is equal to the number of participating siblings in the UKB 
* Still keep in mind that these are based on participation of other siblings in the Biobank
	
	** Imputation of missing rank on basis of siblings who have info on older siblings
	replace rank = 1 if order==1 & n_older_siblings[_n+1]==1 & (famid[_n+1]==famid)

	** imputation of missing ranks for individuals for whom all siblings are in the biobank
	replace rank=order if famid!=. & rank==. & family_size==N_famid

	** Imputation of missing rank based on siblings (whose rank we know) and year-month of birth
	bys famid: gen miss = missing(rank)
	bys famid: egen rmin = min(miss)
	bys famid: egen rmax = max(miss)
	sort famid YoB MoB
	
	replace rank = 1 if rank==. & (rmin==0 & rmax==1) & (rank[_n+1]==2) & (famid[_n+1]==famid)
	replace rank = 2 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==1) & (famid[_n-1]==famid) & family_size==2
	replace rank = 3 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==2) & (famid[_n-1]==famid) & family_size==3
	replace rank = 4 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==3) & (famid[_n-1]==famid) & family_size==4
	replace rank = 5 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==4) & (famid[_n-1]==famid) & family_size==5
	replace rank = 6 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==5) & (famid[_n-1]==famid) & family_size==6
	replace rank = 7 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==6) & (famid[_n-1]==famid) & family_size==7
	replace rank = 8 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==7) & (famid[_n-1]==famid) & family_size==8
	
	* the following added for completeness
	replace rank = 9 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==8) & (famid[_n-1]==famid) & family_size==9
	replace rank = 10 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==9) & (famid[_n-1]==famid) & family_size==10
	replace rank = 11 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==10) & (famid[_n-1]==famid) & family_size==11
	replace rank = 12 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==11) & (famid[_n-1]==famid) & family_size==12
	replace rank = 13 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==12) & (famid[_n-1]==famid) & family_size==13
	replace rank = 14 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==13) & (famid[_n-1]==famid) & family_size==14	
	replace rank = 15 if rank==. & (rmin==0 & rmax==1) & (rank[_n-1]==14) & (famid[_n-1]==famid) & family_size==15	
	drop miss rmin rmax
	
	** Correcting few siblings who report being the same (non-missing) rank
	* Note: Reassign rank based on YoB and MoB only when both sibs say they are rank 1 or 2 & family_size<=2
	* 	We can't do this if they are rank 3+ (e.g. we don't know wether they are rank 2-3 or 3-4)
	bys famid: gen miss = missing(rank)
	bys famid: egen rmin = min(miss)
	bys famid: egen rmax = max(miss)
	bys famid: egen max=max(rank) if rmin==0 & rmax==0 	// create max if rank is non-missing within famid
	bys famid: egen min=min(rank) if rmin==0 & rmax==0 	// create min if rank is non-missing within famid
	gen diff = max-min
	tab min max, m

	tab order if diff==0 			
	* Only reassign if diff==0 as they are the ones where siblings self-recorded the same rank
	* Within diff==0, order = 1 or 2. 
	forvalues i = 1/2 {
		replace rank = `i' if (order==`i' & inlist(family_size,1,2) & diff==0)
		}
	*replace rank = . if (rank==1 & order ~= 1)
	drop miss rmin rmax min max diff 



*********************************************************************************************************
** DROP PEOPLE WITH THE SAME RANK IN THE SAME FAMILY & RANK > FAMILY SIZE
*********************************************************************************************************
gen flag = (rank==rank[_n+1] & famid==famid[_n+1] & rank~=. & famid~=.) 
bys famid: egen maxflag = max(flag)
drop flag
drop if maxflag==1
* 87 observations deleted

* Drop those with rank > family_size
tab ID if (rank>family_size & rank!=. & family_size!=.)
drop if (rank>family_size & rank!=. & family_size!=.)
* 126 observations deleted


// Siblings age gap (largest difference in age between family members in the biobank) 
egen minYoB=min(YoB), by (famid)
gen age_gap=YoB-minYoB if YoB!=. & famid!=.
egen maxYoB=max(YoB), by (famid)
gen largest_age_gap=maxYoB-minYoB if YoB!=. & famid!=.
drop minYoB maxYoB

* Categorizing into firstborn vs laterborn 
gen laterborn=.
replace laterborn=0 if rank==1
replace laterborn=1 if rank>1 & rank!=.

gen firstborn=.
replace firstborn=1 if rank==1
replace firstborn=0 if rank>1 & rank!=.

* Categorizing family size and birth rank 
sum rank if rank>5 & rank!=.
rename rank full_rank 
clonevar rank=full_rank 
replace rank=5 if full_rank>4 & full_rank!=.

* Last child dummy  
gen last_child = .
replace last_child = 1 if full_rank == family_size 
replace last_child = 0 if full_rank < family_size
replace last_child = 0 if full_rank < N_famid
replace last_child=. if full_rank==. | (family_size==. & N_famid == .)
sum last_child if famid!=.


sort famid YoB MoB	
order ID famid sex YoB MoB family_size rank full_rank order firstborn laterborn EA e_educage last_child age_gap largest_age_gap
*drop n_brothers n_sisters n_older_siblings h_DoD h_AoD RL PC pcid relid


// SUM STATS
* Labelling of the variables in use
label variable EA_new "Educational attainment in years"
label variable firstborn "dummy for being the first born among the siblings"
label variable rank "birth order based on the number of older siblings in the family"
label variable family_size "based on the total number of siblings + 1"
label variable last_child "dummy for being the last child among the siblings"
label variable age_gap "difference in age from the oldest sibling"
label variable sex "self-reported gender"

count
sum firstborn
********************************************************************************
*** Checking selection in our sample for Revision 2024
********************************************************************************
global controls sex i.YoB i.MoB e_PC* family_size_imp

reg EA_new firstborn $controls
eststo large_sample

reg EA_new firstborn $controls if family_size_imp>1 // siblings 
fsum EA_new e_educage n_brothers n_sisters n_older_siblings MoB YoB sex if e(sample), format(%12.3f) 

label variable EA_new "Educational attainment in years"
label variable firstborn "dummy for being the first born among the siblings"
label variable rank "birth order based on the number of older siblings in the family"
label variable family_size "based on the total number of siblings + 1"
label variable last_child "dummy for being the last child among the siblings"
label variable age_gap "difference in age from the oldest sibling"
label variable sex "self-reported gender"

fsum EA_new firstborn ea_23me_ld rank family_size_imp last_child sex, format(%12.3f) // cannot compare ukb pgs
fsum EA_new firstborn ea_23me_ld rank family_size_imp last_child sex if famid!=., format(%12.3f) // cannot compare ukb pgs


*Siblings sample 
reg famid EA_new firstborn family_size_imp ea_new_ukb_ld rank last_child age_gap sex i.MoB i.YoB e_PC_1-e_PC_40
keep if e(sample)
count

*** KEEP ONLY SIBLINGS WITH AT LEAST ONE ANOTHER SIB
drop if family_size == 1
*(4 observations deleted)
tab family_size
* Only gives family_size>2
count 

** Recomputing number of members in each family and dropping families with only 1 member
egen N_famid2=count(famid), by(famid)
tab N_famid2
drop if N_famid2==1
count

*** Table C.1 Results of the regressions of years of education on being firstborn is the analysis sample and in an expanded sample.

xtset famid ID
xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
eststo main 
xtreg  EA_new i.firstborn $controls, cluster(famid) fe
eststo analysis_sample_wf
reg EA_new firstborn $controls
eststo analysis_sample

esttab main analysis_sample_wf large_sample, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex family_size_imp ) title("Sample selection in birth order effects") ///
	   se(3) b(3) ///
	   mtitles("Main" "Analysis sample" "Expanded") ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, being last child, gender and 40 first PCs") 

fsum EA_new firstborn ea_23me_ld rank family_size_imp last_child sex, format(%12.3f)

********************************************************************************
*** End of checking selection in our sample 
********************************************************************************	   

** Parental age 
sum c_motherage_0 c_motherage_1 c_motherage_2 c_fatherage_0 c_fatherage_1 c_fatherage_2
gen motherage = . 
replace motherage = c_motherage_0
replace motherage = c_motherage_1 if c_motherage_0 == .
replace motherage = c_motherage_2 if c_motherage_1 == .
* parental cohorts can be identified by substacting the age of parent from the date of attending the assessment center,
* better measure of cohort than age, because of difference in the dates of attendign the assessment center 

*Father's age 
gen fatherage = . 
replace fatherage = c_fatherage_0
replace fatherage = c_fatherage_1 if c_fatherage_0 == .
replace fatherage = c_fatherage_2 if c_fatherage_1 == .

sum motherage fatherage if famid!=.

count // should be 14,850

fsum EA_new e_educage n_brothers n_sisters n_older_siblings MoB YoB sex, format(%12.3f)
* 14,850

save "analysis_data_organized", replace

